In [15]:
# First let's import the packages we will use in this project
# You can do this all now or as you need them
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

pd.options.mode.chained_assignment = None



# Now we need to read in the data
df = pd.read_csv(r'C:\Users\Heng Kimhak\Downloads\movies.csv')
In [16]:
# Let's lookk at the data
df.head()
Out[16]:
name rating genre year released score votes director writer star country budget gross company runtime
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0
In [17]:
# We need to see if we have any missing data
# Let's loop through the data and see if there is anything missing
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))
name - 0%
rating - 1%
genre - 0%
year - 0%
released - 0%
score - 0%
votes - 0%
director - 0%
writer - 0%
star - 0%
country - 0%
budget - 28%
gross - 2%
company - 0%
runtime - 0%
In [18]:
# Data types for our columns
df.dtypes
Out[18]:
name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object
In [19]:
# Change data type of columns
# df['budget'] = df['budget'].astype('Int64')
# df['gross'] = df['gross'].astype('Int64')
# df.dtypes
In [20]:
# Create correct Year column
df['yearcorrect'] = df['released'].astype(str).str[:4]
In [21]:
df.head()
Out[21]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 June
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 July
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 June
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 July
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 July
In [22]:
df[["1", "2"]] = df["released"].str.split(",", expand = True)
In [23]:
df.head()
Out[23]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 June June 13 1980 (United States)
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 July July 2 1980 (United States)
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 June June 20 1980 (United States)
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 July July 2 1980 (United States)
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 July July 25 1980 (United States)
In [24]:
df['2'] = df['2'].astype(str).str[1:5]
In [25]:
df.head()
Out[25]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 June June 13 1980
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 July July 2 1980
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 June June 20 1980
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 July July 2 1980
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 July July 25 1980
In [26]:
df['yearcorrect'] = df['2']
In [27]:
df.head()
Out[27]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 1980 June 13 1980
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 1980 July 2 1980
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 1980 June 20 1980
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 1980 July 2 1980
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 1980 July 25 1980
In [28]:
# df.pop('1')
# df.pop('2')
In [29]:
df.head()
Out[29]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 1980 June 13 1980
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 1980 July 2 1980
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 1980 June 20 1980
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 1980 July 2 1980
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 1980 July 25 1980
In [30]:
df.sort_values(by=['gross'], inplace=False, ascending=False)
Out[30]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
5445 Avatar PG-13 Action 2009 December 18, 2009 (United States) 7.8 1100000.0 James Cameron James Cameron Sam Worthington United States 237000000.0 2.847246e+09 Twentieth Century Fox 162.0 2009 December 18 2009
7445 Avengers: Endgame PG-13 Action 2019 April 26, 2019 (United States) 8.4 903000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 356000000.0 2.797501e+09 Marvel Studios 181.0 2019 April 26 2019
3045 Titanic PG-13 Drama 1997 December 19, 1997 (United States) 7.8 1100000.0 James Cameron James Cameron Leonardo DiCaprio United States 200000000.0 2.201647e+09 Twentieth Century Fox 194.0 1997 December 19 1997
6663 Star Wars: Episode VII - The Force Awakens PG-13 Action 2015 December 18, 2015 (United States) 7.8 876000.0 J.J. Abrams Lawrence Kasdan Daisy Ridley United States 245000000.0 2.069522e+09 Lucasfilm 138.0 2015 December 18 2015
7244 Avengers: Infinity War PG-13 Action 2018 April 27, 2018 (United States) 8.4 897000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 321000000.0 2.048360e+09 Marvel Studios 149.0 2018 April 27 2018
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7663 More to Life NaN Drama 2020 October 23, 2020 (United States) 3.1 18.0 Joseph Ebanks Joseph Ebanks Shannon Bond United States 7000.0 NaN NaN 90.0 2020 October 23 2020
7664 Dream Round NaN Comedy 2020 February 7, 2020 (United States) 4.7 36.0 Dusty Dukatz Lisa Huston Michael Saquella United States NaN NaN Cactus Blue Entertainment 90.0 2020 February 7 2020
7665 Saving Mbango NaN Drama 2020 April 27, 2020 (Cameroon) 5.7 29.0 Nkanya Nkwai Lynno Lovert Onyama Laura United States 58750.0 NaN Embi Productions NaN 2020 April 27 2020
7666 It's Just Us NaN Drama 2020 October 1, 2020 (United States) NaN NaN James Randall James Randall Christina Roz United States 15000.0 NaN NaN 120.0 2020 October 1 2020
7667 Tee em el NaN Horror 2020 August 19, 2020 (United States) 5.7 7.0 Pereko Mosia Pereko Mosia Siyabonga Mabaso South Africa NaN NaN PK 65 Films 102.0 2020 August 19 2020

7668 rows × 18 columns

In [31]:
pd.set_option('display.max_rows', None)
In [43]:
# Drop any duplicates
df['company'].drop_duplicates().sort_values(ascending=False).head()
Out[43]:
7129                     thefyzz
5664                 micro_scope
6412    iDeal Partners Film Fund
4007                    i5 Films
6793                  i am OTHER
Name: company, dtype: object
In [44]:
df['company'].sort_values(ascending=False).head()
Out[44]:
7129                     thefyzz
5664                 micro_scope
6412    iDeal Partners Film Fund
4007                    i5 Films
6793                  i am OTHER
Name: company, dtype: object
In [34]:
df.head()
Out[34]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 1980 June 13 1980
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 1980 July 2 1980
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 1980 June 20 1980
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 1980 July 2 1980
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 1980 July 25 1980
In [35]:
# Budget high correlation 
# Company high correlation
In [36]:
plt.scatter(x=df['budget'], y=df['gross'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Gross Earnings')
plt.ylabel('Budget for Films')
plt.show()
In [37]:
df.head()
Out[37]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 1980 June 13 1980
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 1980 July 2 1980
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 1980 June 20 1980
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 1980 July 2 1980
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 1980 July 25 1980
In [38]:
# Plot budget vs gross using seaborn
sns.regplot(x='budget', y='gross', data=df, scatter_kws={"color": "red"}, line_kws={"color": "blue"})
Out[38]:
<AxesSubplot:xlabel='budget', ylabel='gross'>
In [39]:
# Let's start looking at correlation
df.corr(method='pearson') #pearson, kendall, spearman
Out[39]:
year score votes budget gross runtime
year 1.000000 0.097995 0.222945 0.329321 0.257486 0.120811
score 0.097995 1.000000 0.409182 0.076254 0.186258 0.399451
votes 0.222945 0.409182 1.000000 0.442429 0.630757 0.309212
budget 0.329321 0.076254 0.442429 1.000000 0.740395 0.320447
gross 0.257486 0.186258 0.630757 0.740395 1.000000 0.245216
runtime 0.120811 0.399451 0.309212 0.320447 0.245216 1.000000
In [40]:
# High correlation between budget and gross
correlation_matrix = df.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlatoin Matric for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
In [41]:
# Looks at Company
df.head()
Out[41]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0 1980 June 13 1980
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0 1980 July 2 1980
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0 1980 June 20 1980
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0 1980 July 2 1980
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0 1980 July 25 1980
In [46]:
df_numerized = df
for col_name in df_numerized.columns:
    if(df_numerized[col_name].dtype == 'object'):
        df_numerized[col_name] = df_numerized[col_name].astype('category')
        df_numerized[col_name] = df_numerized[col_name].cat.codes
        
df_numerized.head()
Out[46]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
0 6587 6 6 1980 1705 8.4 927000.0 2589 4014 1047 54 19000000.0 46998772.0 2319 146.0 0 212 0
1 5573 6 1 1980 1492 5.8 65000.0 2269 1632 327 55 4500000.0 58853106.0 731 104.0 0 188 0
2 5142 4 0 1980 1771 8.7 1200000.0 1111 2567 1745 55 18000000.0 538375067.0 1540 124.0 0 223 0
3 286 4 4 1980 1492 7.7 221000.0 1301 2000 2246 55 3500000.0 83453539.0 1812 88.0 0 188 0
4 1027 6 4 1980 1543 7.3 108000.0 1054 521 410 55 6000000.0 39846344.0 1777 98.0 0 194 0
In [47]:
correlation_matrix = df_numerized.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlatoin Matric for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
In [48]:
df_numerized.corr()
Out[48]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect 1 2
name 1.000000 -0.008069 0.016355 0.011453 -0.011311 0.017097 0.013088 0.009079 0.009081 0.006472 -0.010737 0.023970 0.005533 0.009211 0.010392 0.012875 -0.011412 0.012875
rating -0.008069 1.000000 0.072423 0.008779 0.016613 -0.001314 0.033225 0.019483 -0.005921 0.013405 0.081244 -0.176002 -0.107339 -0.032943 0.062145 0.009359 0.016390 0.009359
genre 0.016355 0.072423 1.000000 -0.081261 0.029822 0.027965 -0.145307 -0.015258 0.006567 -0.005477 -0.037615 -0.356564 -0.235650 -0.071067 -0.052711 -0.063069 0.030043 -0.063069
year 0.011453 0.008779 -0.081261 1.000000 -0.000695 0.097995 0.222945 -0.020795 -0.008656 -0.027242 -0.070938 0.329321 0.257486 -0.010431 0.120811 0.965667 -0.003741 0.965667
released -0.011311 0.016613 0.029822 -0.000695 1.000000 0.042788 0.016097 -0.001478 -0.002404 0.015777 -0.020427 0.014683 0.001659 -0.010474 0.000868 -0.010732 0.999759 -0.010732
score 0.017097 -0.001314 0.027965 0.097995 0.042788 1.000000 0.409182 0.009559 0.019416 -0.001609 -0.133348 0.076254 0.186258 0.001030 0.399451 0.107146 0.041366 0.107146
votes 0.013088 0.033225 -0.145307 0.222945 0.016097 0.409182 1.000000 0.000260 0.000892 -0.019282 0.073625 0.442429 0.630757 0.133204 0.309212 0.205894 0.015156 0.205894
director 0.009079 0.019483 -0.015258 -0.020795 -0.001478 0.009559 0.000260 1.000000 0.299067 0.039234 0.017490 -0.012272 -0.014441 0.004404 0.017624 -0.022644 -0.001446 -0.022644
writer 0.009081 -0.005921 0.006567 -0.008656 -0.002404 0.019416 0.000892 0.299067 1.000000 0.027245 0.015343 -0.039451 -0.023519 0.005646 -0.003511 -0.010134 -0.002719 -0.010134
star 0.006472 0.013405 -0.005477 -0.027242 0.015777 -0.001609 -0.019282 0.039234 0.027245 1.000000 -0.012998 -0.019589 -0.002717 0.012442 0.010174 -0.031268 0.016043 -0.031268
country -0.010737 0.081244 -0.037615 -0.070938 -0.020427 -0.133348 0.073625 0.017490 0.015343 -0.012998 1.000000 0.054063 0.092129 0.095548 -0.078412 -0.091171 -0.020124 -0.091171
budget 0.023970 -0.176002 -0.356564 0.329321 0.014683 0.076254 0.442429 -0.012272 -0.039451 -0.019589 0.054063 1.000000 0.740395 0.173214 0.320447 0.314986 0.013792 0.314986
gross 0.005533 -0.107339 -0.235650 0.257486 0.001659 0.186258 0.630757 -0.014441 -0.023519 -0.002717 0.092129 0.740395 1.000000 0.154840 0.245216 0.240118 0.000973 0.240118
company 0.009211 -0.032943 -0.071067 -0.010431 -0.010474 0.001030 0.133204 0.004404 0.005646 0.012442 0.095548 0.173214 0.154840 1.000000 0.034402 -0.026825 -0.010071 -0.026825
runtime 0.010392 0.062145 -0.052711 0.120811 0.000868 0.399451 0.309212 0.017624 -0.003511 0.010174 -0.078412 0.320447 0.245216 0.034402 1.000000 0.115024 -0.001234 0.115024
yearcorrect 0.012875 0.009359 -0.063069 0.965667 -0.010732 0.107146 0.205894 -0.022644 -0.010134 -0.031268 -0.091171 0.314986 0.240118 -0.026825 0.115024 1.000000 -0.013700 1.000000
1 -0.011412 0.016390 0.030043 -0.003741 0.999759 0.041366 0.015156 -0.001446 -0.002719 0.016043 -0.020124 0.013792 0.000973 -0.010071 -0.001234 -0.013700 1.000000 -0.013700
2 0.012875 0.009359 -0.063069 0.965667 -0.010732 0.107146 0.205894 -0.022644 -0.010134 -0.031268 -0.091171 0.314986 0.240118 -0.026825 0.115024 1.000000 -0.013700 1.000000
In [50]:
correlation_mat = df_numerized.corr()
corr_pairs = correlation_mat.unstack()
corr_pairs.head()
Out[50]:
name  name        1.000000
      rating     -0.008069
      genre       0.016355
      year        0.011453
      released   -0.011311
dtype: float64
In [51]:
sorted_pairs = corr_pairs.sort_values()
sorted_pairs.head()
Out[51]:
budget  genre    -0.356564
genre   budget   -0.356564
        gross    -0.235650
gross   genre    -0.235650
budget  rating   -0.176002
dtype: float64
In [52]:
high_corr = sorted_pairs[(sorted_pairs) > 0.5] 
high_corr.head()
Out[52]:
gross        votes     0.630757
votes        gross     0.630757
budget       gross     0.740395
gross        budget    0.740395
yearcorrect  year      0.965667
dtype: float64
In [ ]:
# Votes and budget have the highest correlation to gross earnings
# Company has low correlations